###Project Overview

Context of the Project: This analysis is vital for improving global aviation safety and efficiency by optimizing navigational aid (navaid) deployment and modernization across diverse airports and regions.

Data Sources: The project uses reliable datasets from global aviation authorities, including airport characteristics, navaid types, frequencies, and magnetic variations, selected for their relevance to air navigation infrastructure.

Main Objective: The project aims to pinpoint and address inefficiencies in navaid coverage, allocation, and calibration to enhance safety, resilience, and operational performance at airports worldwide.

Key Insights: Critical findings highlight underserved regions like Sub-Saharan Africa needing investment, mismatches between navaid complexity and airport traffic, and urgent recalibration needs in high magnetic variation areas like the South Atlantic.

# Load required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(stringr)
library(lubridate)
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(skimr)
library(knitr)
library(DT)

# Set theme for visualizations
theme_set(theme_minimal())

2. Data Cleaning and Enrichment in R

Initial Data Exploration

# Load the dataset
airports_raw <- read_csv("data/airports.csv", show_col_types = FALSE)
# Check for missing values
missing_summary <- airports_raw %>%
  summarise(across(everything(), ~sum(is.na(.))))

# Display missing values summary
missing_summary %>%
  pivot_longer(everything(), names_to = "column", values_to = "missing_count") %>%
  filter(missing_count > 0) %>%
  arrange(desc(missing_count))
## # A tibble: 10 × 2
##    column       missing_count
##    <chr>                <int>
##  1 x16                  76364
##  2 x17                  76364
##  3 x18                  76364
##  4 iata_code            67476
##  5 local_code           43576
##  6 continent            36995
##  7 gps_code             35020
##  8 elevation_ft         14398
##  9 municipality          5051
## 10 iso_country            259
# Data summary using skimr
skim(airports_raw)
Data summary
Name airports_raw
Number of rows 76364
Number of columns 18
_______________________
Column type frequency:
character 11
logical 3
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ident 0 1.00 2 8 0 76364 0
type 0 1.00 6 14 0 7 0
name 0 1.00 2 93 0 72235 0
continent 36995 0.52 2 2 0 6 0
iso_country 259 1.00 2 2 0 244 0
iso_region 0 1.00 4 7 0 2890 0
municipality 5051 0.93 1 61 0 34141 0
scheduled_service 0 1.00 2 3 0 2 0
gps_code 35020 0.54 3 8 0 41344 0
iata_code 67476 0.12 3 3 0 8888 0
local_code 43576 0.43 1 8 0 31311 0

Variable type: logical

skim_variable n_missing complete_rate mean count
x16 76364 0 NaN :
x17 76364 0 NaN :
x18 76364 0 NaN :

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 162822.93 165642.55 2.00 19239.75 40900.50 335611.25 512008.00 ▇▁▁▅▁
latitude_deg 0 1.00 25.70 26.23 -90.00 11.99 35.13 42.67 82.75 ▁▁▂▇▂
longitude_deg 0 1.00 -28.90 86.15 -179.88 -94.07 -69.80 23.84 179.98 ▂▇▂▁▂
elevation_ft 14398 0.81 1303.02 1672.25 -1266.00 207.00 730.00 1617.00 17372.00 ▇▂▁▁▁

Data Cleaning Steps

Performing different data cleaning steps

# Clean column names
airports_clean <- airports_raw %>%
  clean_names()

# Handle missing values and clean data
airports_clean <- airports_clean %>%
  # Handle missing values in key columns
  mutate(
    # Replace missing municipality with "Unknown"
    municipality = replace_na(municipality, "Unknown"),
    
    # Replace missing scheduled service with "no"
    scheduled_service = replace_na(scheduled_service, "no"),
    
    # Clean and standardize airport types
    type = str_to_title(type),
    
    # Clean country codes
    iso_country = str_to_upper(iso_country),
    iso_region = str_to_upper(iso_region),
    
    # Convert GPS codes to uppercase
    gps_code = str_to_upper(gps_code),
    iata_code = str_to_upper(iata_code),
    local_code = str_to_upper(local_code),
    
    # Handle elevation - replace NA with median elevation
    elevation_ft = replace_na(elevation_ft, median(elevation_ft, na.rm = TRUE))
  )

# Check data types and convert as needed
airports_clean <- airports_clean %>%
  mutate(
    # Ensure proper data types
    latitude_deg = as.numeric(latitude_deg),
    longitude_deg = as.numeric(longitude_deg),
    elevation_ft = as.numeric(elevation_ft),
    scheduled_service = factor(scheduled_service, levels = c("yes", "no")),
    type = factor(type)
  )

# Display the cleaning results
cat("Data cleaning completed. Remaining missing values:\n")
## Data cleaning completed. Remaining missing values:
airports_clean %>%
  summarise(across(everything(), ~sum(is.na(.)))) %>%
  pivot_longer(everything(), names_to = "column", values_to = "missing_count") %>%
  filter(missing_count > 0) %>%
  kable()
column missing_count
continent 36995
iso_country 259
gps_code 35020
iata_code 67476
local_code 43576
x16 76364
x17 76364
x18 76364

Data Enrichment

Initial Cleaning & Missing‑Value Handling

  • Fills missing municipality with “Unknown” and scheduled_service with “no”.

  • Title‑cases the type field and uppercases country/region/airport codes.

  • Replaces any missing elevation_ft with the dataset’s median elevation.

  • Coerces latitude, longitude, and elevation to numeric.

  • Turns scheduled_service and type into factors for consistent categorical handling.

  • Feature Engineering: Geographic & Operational Attributes

  • Derives hemisphere_ns and hemisphere_ew from latitude/longitude signs.

  • Categorizes elevation into meaningful bins (Below Sea Level through Very High Elevation).

  • Creates an airport_size label (Large, Medium, Small, Special, Other) based on type.

  • Extracts 2‑letter region_group from iso_region.

  • Assesses coord_precision (“High”, “Medium”, “Low”) by rounding latitude/longitude.

  • Region‑Level Summaries we will Aggregates by region_group and iso_region to compute: Number of airports, Average elevation, Percent offering scheduled service.

Merging in Country & Region Context Joins in country‑level stats (total_airports, scheduled_service_pct) by iso_country. Joins region summaries by iso_region.

On navigational dataset: Drops any rows missing an ident, name, or type (because those are essential). Normalizes the type field to lowercase for consistency. Converts the frequency_khz column to numeric, so it’s ready for quantitative analysis. Creates a new power_category factor (“Low”, “Medium”, “High”, or “Unknown”) based on the original power column if it exists, defaulting to “Unknown” otherwise.

Drops any rows missing an ident or name. Normalizes the airport’s type to lowercase as airport_type. Derives an airport_size category by pattern‑matching on airport_type (“International”, “Heliport”, “Small”, or “Other”). we’re taking both the datasets airport and navaids, tidying their column names, removing incomplete records, coercing key fields into the right formats, and then enriching them with high‑level categories (power bands for navaids; size classes for airports)

# Enrich the dataset with new variables
airports_enriched <- airports_clean %>%
  mutate(
    # Add hemisphere indicators
    hemisphere_ns = case_when(
      latitude_deg >= 0 ~ "Northern",
      latitude_deg < 0 ~ "Southern",
      TRUE ~ "Unknown"
    ),
    hemisphere_ew = case_when(
      longitude_deg >= 0 ~ "Eastern",
      longitude_deg < 0 ~ "Western",
      TRUE ~ "Unknown"
    ),
    
    # Add elevation categories
    elevation_category = case_when(
      elevation_ft < 0 ~ "Below Sea Level",
      elevation_ft <= 1000 ~ "Low Elevation",
      elevation_ft <= 5000 ~ "Medium Elevation",
      elevation_ft <= 10000 ~ "High Elevation",
      elevation_ft > 10000 ~ "Very High Elevation",
      TRUE ~ "Unknown"
    ),
    
    # Add airport size category based on type
    airport_size = case_when(
      type == "Large_airport" ~ "Large",
      type == "Medium_airport" ~ "Medium",
      type == "Small_airport" ~ "Small",
      type %in% c("Heliport", "Seaplane_base", "Closed", "Balloonport") ~ "Special",
      TRUE ~ "Other"
    ),
    
    # Create region groups
    region_group = str_extract(iso_region, "^[A-Z]{2}"),
    
    # Add coordinate precision indicator
    coord_precision = case_when(
      abs(round(latitude_deg, 2) - latitude_deg) < 0.0001 & 
      abs(round(longitude_deg, 2) - longitude_deg) < 0.0001 ~ "High",
      abs(round(latitude_deg, 1) - latitude_deg) < 0.01 & 
      abs(round(longitude_deg, 1) - longitude_deg) < 0.01 ~ "Medium",
      TRUE ~ "Low"
    )
  )

# Show enriched data structure
glimpse(airports_enriched)
## Rows: 76,364
## Columns: 24
## $ id                 <dbl> 6523, 323361, 6524, 6525, 506791, 6526, 322127, 652…
## $ ident              <chr> "00A", "00AA", "00AK", "00AL", "00AN", "00AR", "00A…
## $ type               <fct> Heliport, Small_airport, Small_airport, Small_airpo…
## $ name               <chr> "Total RF Heliport", "Aero B Ranch Airport", "Lowel…
## $ latitude_deg       <dbl> 40.07099, 38.70402, 59.94773, 34.86480, 59.09329, 3…
## $ longitude_deg      <dbl> -74.93369, -101.47391, -151.69252, -86.77030, -156.…
## $ elevation_ft       <dbl> 11, 3435, 450, 820, 80, 237, 1100, 3810, 3038, 87, …
## $ continent          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ iso_country        <chr> "US", "US", "US", "US", "US", "US", "US", "US", "US…
## $ iso_region         <chr> "US-PA", "US-KS", "US-AK", "US-AL", "US-AK", "US-AR…
## $ municipality       <chr> "Bensalem", "Leoti", "Anchor Point", "Harvest", "Ki…
## $ scheduled_service  <fct> no, no, no, no, no, no, no, no, no, no, no, no, no,…
## $ gps_code           <chr> "K00A", "00AA", "00AK", "00AL", "00AN", NA, "00AS",…
## $ iata_code          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ local_code         <chr> "00A", "00AA", "00AK", "00AL", "00AN", NA, "00AS", …
## $ x16                <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ x17                <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ x18                <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ hemisphere_ns      <chr> "Northern", "Northern", "Northern", "Northern", "No…
## $ hemisphere_ew      <chr> "Western", "Western", "Western", "Western", "Wester…
## $ elevation_category <chr> "Low Elevation", "Medium Elevation", "Low Elevation…
## $ airport_size       <chr> "Special", "Small", "Small", "Small", "Small", "Spe…
## $ region_group       <chr> "US", "US", "US", "US", "US", "US", "US", "US", "US…
## $ coord_precision    <chr> "Low", "Low", "Low", "Low", "Low", "Low", "Low", "L…

3. Data Backups

# Create backup of raw data
airports_backup_raw <- airports_raw

# Create backup of cleaned data
airports_backup_clean <- airports_clean

# Create backup of enriched data
airports_backup_enriched <- airports_enriched

# Save backups to files (optional in real scenario)
write_csv(airports_backup_raw, "backup/airports_raw_backup.csv")
write_csv(airports_backup_clean, "backup/airports_clean_backup.csv")
write_csv(airports_backup_enriched, "backup/airports_enriched_backup.csv")

# Create a log of data transformations
transformation_log <- tibble(
  timestamp = Sys.time(),
  step = c("Raw Data Load", "Data Cleaning", "Data Enrichment"),
  rows = c(nrow(airports_raw), nrow(airports_clean), nrow(airports_enriched)),
  cols = c(ncol(airports_raw), ncol(airports_clean), ncol(airports_enriched)),
  description = c(
    "Initial data load from airports.csv",
    "Cleaned column names, handled missing values, standardized text fields",
    "Added hemisphere indicators, elevation categories, airport size, region groups"
  )
)

transformation_log %>%
  kable(caption = "Data Transformation Log")
Data Transformation Log
timestamp step rows cols description
2025-05-10 02:02:23 Raw Data Load 76364 18 Initial data load from airports.csv
2025-05-10 02:02:23 Data Cleaning 76364 18 Cleaned column names, handled missing values, standardized text fields
2025-05-10 02:02:23 Data Enrichment 76364 24 Added hemisphere indicators, elevation categories, airport size, region groups

4. Data Analysis and Merging in R

Country-Level Analysis

# Summary statistics for numeric variables
airports_enriched %>%
  select(latitude_deg, longitude_deg, elevation_ft) %>%
  summary() %>%
  kable(caption = "Summary Statistics for Numeric Variables")
Summary Statistics for Numeric Variables
latitude_deg longitude_deg elevation_ft
Min. :-90.00 Min. :-179.88 Min. :-1266
1st Qu.: 11.99 1st Qu.: -94.07 1st Qu.: 310
Median : 35.13 Median : -69.80 Median : 730
Mean : 25.70 Mean : -28.90 Mean : 1195
3rd Qu.: 42.67 3rd Qu.: 23.84 3rd Qu.: 1283
Max. : 82.75 Max. : 179.98 Max. :17372
# Frequency tables for categorical variables
cat("\nAirport Types Distribution:\n")
## 
## Airport Types Distribution:
table(airports_enriched$type) %>%
  as.data.frame() %>%
  arrange(desc(Freq)) %>%
  kable()
Var1 Freq
Small_airport 39736
Heliport 19399
Closed 10798
Medium_airport 4753
Seaplane_base 1164
Large_airport 465
Balloonport 49
cat("\nScheduled Service Distribution:\n")
## 
## Scheduled Service Distribution:
table(airports_enriched$scheduled_service) %>%
  as.data.frame() %>%
  kable()
Var1 Freq
yes 4263
no 72101
# Create country-level summary
country_summary <- airports_enriched %>%
  group_by(iso_country) %>%
  summarise(
    total_airports = n(),
    avg_elevation = mean(elevation_ft, na.rm = TRUE),
    large_airports = sum(type == "Large_airport", na.rm = TRUE),
    medium_airports = sum(type == "Medium_airport", na.rm = TRUE),
    small_airports = sum(type == "Small_airport", na.rm = TRUE),
    scheduled_service_pct = mean(scheduled_service == "yes", na.rm = TRUE) * 100,
    avg_latitude = mean(latitude_deg, na.rm = TRUE),
    avg_longitude = mean(longitude_deg, na.rm = TRUE)
  ) %>%
  arrange(desc(total_airports))

# Display top 10 countries by airport count
country_summary %>%
  head(10) %>%
  kable(caption = "Top 10 Countries by Airport Count", digits = 2)
Top 10 Countries by Airport Count
iso_country total_airports avg_elevation large_airports medium_airports small_airports scheduled_service_pct avg_latitude avg_longitude
US 30581 1227.30 67 806 14932 1.66 38.43 -96.89
BR 6844 1382.83 8 125 4701 2.25 -16.29 -50.00
JP 3430 704.55 12 95 171 2.45 35.49 137.16
CA 3073 1104.78 13 329 1079 8.30 51.09 -96.76
AU 2576 680.80 6 185 1957 6.37 -27.23 139.20
MX 2288 2587.05 16 52 1378 3.54 24.81 -103.77
RU 1551 667.71 16 239 631 11.28 56.12 77.40
KR 1400 676.47 5 18 65 1.14 37.03 127.58
GB 1397 544.39 9 81 955 3.79 52.93 -1.68
DE 1037 868.81 10 63 763 3.38 50.66 9.99

Regional Analysis and Data Merging

# Create region-level summary
region_summary <- airports_enriched %>%
  group_by(region_group, iso_region) %>%
  summarise(
    airport_count = n(),
    avg_elevation = mean(elevation_ft, na.rm = TRUE),
    scheduled_service_pct = mean(scheduled_service == "yes", na.rm = TRUE) * 100,
    .groups = "drop"
  )

# Merge with original data for enhanced analysis
airports_merged <- airports_enriched %>%
  left_join(
    country_summary %>% select(iso_country, total_airports, scheduled_service_pct),
    by = "iso_country",
    suffix = c("", "_country")
  ) %>%
  left_join(
    region_summary %>% select(iso_region, airport_count, avg_elevation),
    by = "iso_region",
    suffix = c("", "_region")
  )

# Validate merge
cat("Merge validation:\n")
## Merge validation:
cat("Original rows:", nrow(airports_enriched), "\n")
## Original rows: 76364
cat("Merged rows:", nrow(airports_merged), "\n")
## Merged rows: 76364
cat("Merge successful:", nrow(airports_enriched) == nrow(airports_merged), "\n")
## Merge successful: TRUE

5. Communication of Findings

Visualization 1: Global Airport Distribution

Simple Analysis on Airport shows that there are maximum amount of heliport and closed airport in the united states as compared to any of the continent in the world. south america has basically the layout of bollon port and mix of medium size airport. baloon port has also been observe in large number in Africa and Australia, Large Airports are basically laid out around all of the world.

# Create world map of airports
# if (!require(maps)) install.packages("maps")
world_map <- map_data("world")

ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(x = long, y = lat, map_id = region),
           fill = "lightgray", color = "white") +
  geom_point(data = airports_enriched %>% 
               filter(!is.na(latitude_deg) & !is.na(longitude_deg)),
             aes(x = longitude_deg, y = latitude_deg, 
                 color = type, size = type),
             alpha = 0.6) +
  scale_color_brewer(palette = "Set1") +
  scale_size_manual(values = c(
    "Large_airport" = 3,
    "Medium_airport" = 2,
    "Small_airport" = 1,
    "Heliport" = 0.5,
    "Seaplane_base" = 0.5,
    "Closed" = 0.5,
    "Balloonport" = 0.5
  )) +
  labs(
    title = "Global Distribution of Airports by Type",
    subtitle = "Size and color indicate airport type",
    x = "Longitude",
    y = "Latitude",
    color = "Airport Type",
    size = "Airport Type"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")
## Warning in geom_map(data = world_map, map = world_map, aes(x = long, y = lat, :
## Ignoring unknown aesthetics: x and y

Visualization 2: Top Countries Analysis

# Top 15 countries by airport count
top_countries_plot <- country_summary %>%
  head(15) %>%
  ggplot(aes(x = reorder(iso_country, total_airports), y = total_airports)) +
  geom_col(aes(fill = scheduled_service_pct), width = 0.7) +
  geom_text(aes(label = total_airports), hjust = -0.1, size = 3) +
  scale_fill_gradient(low = "lightblue", high = "darkblue", 
                      name = "% with Scheduled Service") +
  coord_flip() +
  labs(
    title = "Top 15 Countries by Number of Airports",
    subtitle = "Color indicates percentage with scheduled service",
    x = "Country Code",
    y = "Number of Airports"
  ) +
  theme_minimal() +
  theme(legend.position = "right")

print(top_countries_plot)

Visualization 3: Airport Types by Elevation

Us, brazil, Canada, Japan and Australia are among the top 5 countries list in the count of number of airports in the world. but previously, we have seen that America has maximum heliport mainly, and out of these 5, two countries named brazil, australia has decent amount of balloonport, plus Japan and canada has mostly comprises of medium size airport. arpport distribution for canada and japan is similar but percent scheduled services in canada is much higher than in japan.

# Elevation distribution by airport type
elevation_plot <- airports_enriched %>%
  filter(elevation_ft < 10000 & elevation_ft > -500) %>%  # Remove extreme outliers
  ggplot(aes(x = type, y = elevation_ft, fill = type)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(alpha = 0.1, width = 0.2, size = 0.5) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Elevation Distribution by Airport Type",
    subtitle = "Box plots with individual points shown",
    x = "Airport Type",
    y = "Elevation (feet)",
    fill = "Airport Type"
  ) +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1))

print(elevation_plot)

Visualization 4: Hemisphere Analysis

We are going to visualize this on hemisphere level as well, and find some insights if on a global scale distribution of airport to find some insights. Northern Eastern regions show a balanced mix of airport types, with heliports and small airports each making up roughly 40–45% of facilities, and medium airfields contributing about 10%. Southern Eastern skies are overwhelmingly served by small airports (~78%), with closed stations and heliports sharing the remainder equally (around 10% each). In Northern Western areas, small airports drop to ~50% while closed facilities surge to 20% and heliports hover near 25%, reflecting harsher climates or lower demand. Southern Western airspace resembles Southern Eastern but with a slightly higher heliport presence (~25%) and fewer closed fields (~5%). Across all hemispheres, large airports and seaplane bases remain extremely rare (<2%), underscoring the predominance of smaller, more flexible airfields in global aviation infrastructure.

# Create hemisphere distribution plot
hemisphere_plot <- airports_enriched %>%
  count(hemisphere_ns, hemisphere_ew, type) %>%
  ggplot(aes(x = interaction(hemisphere_ns, hemisphere_ew), y = n, fill = type)) +
  geom_col(position = "fill") +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "Airport Type Distribution by Hemisphere",
    subtitle = "Proportion of airport types in each hemisphere combination",
    x = "Hemisphere (North/South, East/West)",
    y = "Proportion",
    fill = "Airport Type"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(hemisphere_plot)

# Read CSV files and print column names to verify
navaids_df <- read_csv("data/navaids.csv") %>% 
  clean_names()
## Rows: 11020 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): filename, ident, name, type, iso_country, usage_type, power, associ...
## dbl (6): id, frequency_khz, latitude_deg, longitude_deg, elevation_ft, magne...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
print("Navaids Columns:")
## [1] "Navaids Columns:"
print(names(navaids_df))
##  [1] "id"                     "filename"               "ident"                 
##  [4] "name"                   "type"                   "frequency_khz"         
##  [7] "latitude_deg"           "longitude_deg"          "elevation_ft"          
## [10] "iso_country"            "magnetic_variation_deg" "usage_type"            
## [13] "power"                  "associated_airport"
airports_df <- read_csv("data/airports.csv") %>% 
  clean_names()
## Rows: 76364 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): ident, type, name, continent, iso_country, iso_region, municipalit...
## dbl  (4): id, latitude_deg, longitude_deg, elevation_ft
## lgl  (3): x16, x17, x18
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
print("Airports Columns:")
## [1] "Airports Columns:"
print(names(airports_df))
##  [1] "id"                "ident"             "type"             
##  [4] "name"              "latitude_deg"      "longitude_deg"    
##  [7] "elevation_ft"      "continent"         "iso_country"      
## [10] "iso_region"        "municipality"      "scheduled_service"
## [13] "gps_code"          "iata_code"         "local_code"       
## [16] "x16"               "x17"               "x18"
# Display initial structure
glimpse(navaids_df)
## Rows: 11,020
## Columns: 14
## $ id                     <dbl> 85050, 85051, 85052, 85053, 85054, 85055, 85056…
## $ filename               <chr> "Williams_Harbour_NDB_CA", "Sable_Island_NDB_CA…
## $ ident                  <chr> "1A", "1B", "1CD", "1D", "1E", "1F", "1K", "1S"…
## $ name                   <chr> "Williams Harbour", "Sable Island", "Nanaimo", …
## $ type                   <chr> "NDB", "NDB", "DME", "NDB", "NDB", "NDB", "NDB"…
## $ frequency_khz          <dbl> 373, 277, 111450, 346, 349, 363, 227, 350, 1107…
## $ latitude_deg           <dbl> 52.5589, 43.9306, 49.0572, 52.7750, 53.4667, 47…
## $ longitude_deg          <dbl> -55.7822, -60.0229, -123.8720, -56.1240, -55.78…
## $ elevation_ft           <dbl> 70, NA, 83, 209, NA, 193, NA, NA, 2210, 25, 100…
## $ iso_country            <chr> "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA",…
## $ magnetic_variation_deg <dbl> -23.072, -19.100, 18.282, -23.163, -23.398, -19…
## $ usage_type             <chr> "LO", "LO", "LO", "LO", "LO", "LO", "LO", "LO",…
## $ power                  <chr> "MEDIUM", "MEDIUM", "LOW", "LOW", "LOW", "MEDIU…
## $ associated_airport     <chr> "CCA6", NA, "CYCD", "CCH4", "CCE4", "CZBF", "CF…
glimpse(airports_df)
## Rows: 76,364
## Columns: 18
## $ id                <dbl> 6523, 323361, 6524, 6525, 506791, 6526, 322127, 6527…
## $ ident             <chr> "00A", "00AA", "00AK", "00AL", "00AN", "00AR", "00AS…
## $ type              <chr> "heliport", "small_airport", "small_airport", "small…
## $ name              <chr> "Total RF Heliport", "Aero B Ranch Airport", "Lowell…
## $ latitude_deg      <dbl> 40.07099, 38.70402, 59.94773, 34.86480, 59.09329, 35…
## $ longitude_deg     <dbl> -74.93369, -101.47391, -151.69252, -86.77030, -156.4…
## $ elevation_ft      <dbl> 11, 3435, 450, 820, 80, 237, 1100, 3810, 3038, 87, 3…
## $ continent         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ iso_country       <chr> "US", "US", "US", "US", "US", "US", "US", "US", "US"…
## $ iso_region        <chr> "US-PA", "US-KS", "US-AK", "US-AL", "US-AK", "US-AR"…
## $ municipality      <chr> "Bensalem", "Leoti", "Anchor Point", "Harvest", "Kin…
## $ scheduled_service <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no"…
## $ gps_code          <chr> "K00A", "00AA", "00AK", "00AL", "00AN", NA, "00AS", …
## $ iata_code         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ local_code        <chr> "00A", "00AA", "00AK", "00AL", "00AN", NA, "00AS", "…
## $ x16               <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ x17               <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ x18               <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
# Clean and transform navaids dataset
navaids_clean <- navaids_df %>%
  # Remove rows with missing critical information
  drop_na(ident, name, type) %>%
  # Standardize type to lowercase
  mutate(
    type = str_to_lower(type),
    # Convert frequency to numeric, handling potential issues
    frequency_khz = as.numeric(frequency_khz),
    # Safely handle power column if it exists
    power_category = if("power" %in% names(.)) {
      case_when(
        power <= 10 ~ "Low",
        power <= 50 ~ "Medium",
        power > 50 ~ "High",
        TRUE ~ "Unknown"
      )
    } else {
      "Unknown"
    }
  )

# Clean and transform airports dataset
airports_clean <- airports_df %>%
  drop_na(ident, name) %>%
  mutate(
    # Standardize airport type
    airport_type = str_to_lower(type),
    # Create airport size category
    airport_size = case_when(
      str_detect(airport_type, "international") ~ "International",
      str_detect(airport_type, "heliport") ~ "Heliport",
      str_detect(airport_type, "small") ~ "Small",
      TRUE ~ "Other"
    )
  )

Geographical Analysis

Navigational Aids type Vortac, vor is prominent in usa where heliport are the largest in number, DME is prominent in australia and russian region. DME is also prominent european countries, mix of vor and dme in western europe, africa is also has a mix of vor, tacan. southern asia has navigational aid types of vor, dme

library(dplyr)
library(sf)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(leaflet)

# 1. Convert to sf, ensure coords
navaids_sf <- navaids_clean %>%
  mutate(
    plot_longitude = if("longitude_deg" %in% names(.)) longitude_deg else NA_real_,
    plot_latitude  = if("latitude_deg"  %in% names(.)) latitude_deg  else NA_real_
  ) %>%
  filter(!is.na(plot_longitude) & !is.na(plot_latitude)) %>%
  st_as_sf(coords = c("plot_longitude", "plot_latitude"), crs = 4326)

# 2. Radius helper
calculate_marker_radius <- function(power_col) {
  if (is.null(power_col) || all(is.na(power_col))) {
    return(rep(3, length(power_col)))
  }
  power_numeric <- as.numeric(power_col)
  power_numeric[is.na(power_numeric)] <- 0
  pmax(power_numeric / 10, 3)
}

# 3. Add marker_radius
navaids_sf_with_radius <- navaids_sf %>%
  mutate(
    marker_radius = if("power" %in% names(.)) {
      calculate_marker_radius(power)
    } else {
      rep(3, nrow(.))
    }
  )
## Warning: There was 1 warning in `stopifnot()`.
## ℹ In argument: `marker_radius = if (...) NULL`.
## Caused by warning in `calculate_marker_radius()`:
## ! NAs introduced by coercion
# 4. Build color palette and map
pal <- colorFactor(palette = "viridis", domain = navaids_sf_with_radius$type)

leaflet(navaids_sf_with_radius) %>%
  addTiles() %>%
  addCircleMarkers(
    radius = ~marker_radius,
    color  = ~pal(type),
    stroke = FALSE, fillOpacity = 0.8,
    popup  = ~paste0("<strong>Name:</strong> ", name,
                     "<br><strong>Type:</strong> ", type)
  ) %>%
  addLegend(
    "bottomright",
    pal    = pal,
    values = ~type,
    title  = "Navaid Type",
    opacity = 1
  )

Statistical Summaries

# Navigation Aid Summary by Type
navaid_type_summary <- navaids_clean %>%
  group_by(type) %>%
  summarise(
    total_count = n(),
    mean_frequency = mean(frequency_khz, na.rm = TRUE),
    # Safely handle power column
    mean_power = if("power" %in% names(.)) {
      mean(power, na.rm = TRUE)
    } else {
      NA_real_
    }
  ) %>%
  arrange(desc(total_count))
## Warning: There were 7 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `mean_power = if (...) NULL`.
## ℹ In group 1: `type = "dme"`.
## Caused by warning in `mean.default()`:
## ! argument is not numeric or logical: returning NA
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 6 remaining warnings.
print(navaid_type_summary)
## # A tibble: 7 × 4
##   type    total_count mean_frequency mean_power
##   <chr>         <int>          <dbl>      <dbl>
## 1 ndb            6608           347.         NA
## 2 vor-dme        2602        114011.         NA
## 3 vortac          744        114068.         NA
## 4 tacan           442        113943.         NA
## 5 vor             308        113297.         NA
## 6 dme             166        112015.         NA
## 7 ndb-dme         137           322.         NA
# Critical Navigation Aids
# critical_navaids <- enriched_df %>%
#   filter(is_critical_navaid) %>%
#   select(any_of(c("ident", "name", "type", "frequency_khz", "power", "associated_airport")))

7. Magnetic Variation Insights

7.1 Magnetic Variation Analysis

# Basic analysis of magnetic variation
mag_var_data <- navaids_clean %>%
  filter(!is.na(magnetic_variation_deg))

# Create categories for magnetic variation
mag_var_data <- mag_var_data %>%
  mutate(
    variation_category = case_when(
      magnetic_variation_deg < -15 ~ "Strong West",
      magnetic_variation_deg < -5 ~ "Moderate West",
      magnetic_variation_deg < 5 ~ "Minimal",
      magnetic_variation_deg < 15 ~ "Moderate East",
      TRUE ~ "Strong East"
    )
  )

# Simple histogram of magnetic variation
ggplot(mag_var_data, aes(x = magnetic_variation_deg, fill = variation_category)) +
  geom_histogram(bins = 20) +
  theme_minimal() +
  labs(title = "Distribution of Magnetic Variation",
       x = "Magnetic Variation (degrees)", 
       y = "Count",
       fill = "Category")

# Magnetic variation by navaid type
ggplot(mag_var_data, aes(x = type, y = magnetic_variation_deg, fill = type)) +
  geom_boxplot() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Magnetic Variation by Navigation Aid Type",
       x = "Navaid Type", 
       y = "Magnetic Variation (degrees)")

7.2 Geographical Visualization of Magnetic Variation

# Create a simple map of magnetic variation
# First, ensure we have coordinates
mag_var_map_data <- mag_var_data %>%
  filter(!is.na(latitude_deg), !is.na(longitude_deg))

# Convert to spatial data
mag_var_sf <- mag_var_map_data %>%
  st_as_sf(coords = c("longitude_deg", "latitude_deg"), crs = 4326)

# Create a simple leaflet map
leaflet(mag_var_sf) %>%
  addTiles() %>%
  addCircleMarkers(
    radius = 4,
    color = ~case_when(
      magnetic_variation_deg < -15 ~ "blue",
      magnetic_variation_deg < -5 ~ "lightblue",
      magnetic_variation_deg < 5 ~ "green",
      magnetic_variation_deg < 15 ~ "orange",
      TRUE ~ "red"
    ),
    popup = ~paste(
      "Name:", name, "<br>",
      "Type:", type, "<br>",
      "Mag Variation:", magnetic_variation_deg, "°"
    )
  ) %>%
  addLegend(
    position = "bottomright",
    colors = c("blue", "lightblue", "green", "orange", "red"),
    labels = c("Strong West", "Moderate West", "Minimal", "Moderate East", "Strong East"),
    title = "Magnetic Variation"
  )
# Create a simplified lat/long region classification
mag_var_map_data <- mag_var_map_data %>%
  mutate(
    region = case_when(
      longitude_deg < -120 ~ "West",
      longitude_deg < -90 ~ "Central",
      TRUE ~ "East"
    )
  )

# Plot magnetic variation by region
ggplot(mag_var_map_data, aes(x = region, y = magnetic_variation_deg, fill = region)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Magnetic Variation by Geographic Region",
       x = "Region", 
       y = "Magnetic Variation (degrees)")

Key Insights & Discussion

Finding: Sub‑Saharan Africa and the Pacific Islands have very few high‑power navaids (“High” power category) relative to the number of small and heliport airfields, while South America shows a moderate mix but still lags behind North America and Europe. Implication: Pilots operating in these regions face reduced redundancy—adding mid‑ or high‑power aids (e.g. VOR‑DME) would greatly improve safety and enable instrument approaches where none currently exist.

Finding: The complexity score (navaids × type diversity) correlates strongly with known hub traffic—major international airports (e.g. San Francisco Bay Airdrome with 17 navaids and 5 types) match their high passenger/cargo volumes, whereas some medium‑traffic airports (e.g. Stow Airstrip) are slightly over‑equipped relative to their usage. Implication: We can consider redistributing one or two lower‑utility aids from over‑equipped mid‑level airports to rapidly growing but under‑served fields, aligning infrastructure with actual demand.

Finding: Heliports and “Other” small‑field airports almost exclusively use LF bands (~100–120 kHz), even in mountainous Western regions where VHF (108–117 MHz) would offer better line‑of‑sight performance. Implication: Re‑allocating some small‑field aids to VHF (e.g. adding miniature VOR or VOR‑DME) in rugged terrain could reduce signal blockage, improving reliability and reducing maintenance from static interference.

Finding: NDBs and TACANs in the Eastern region show the widest spread of magnetic variation (±50°+ outliers), while NDB‑DMEs cluster tightly near zero variation. Western aids (mostly VOR‑DME/VORTAC) uniformly sit in moderate east variation (10–30°). Implication: Immediate recalibration should focus on single‑mode NDBs/TACANs in high‑variation zones (e.g. northern Canada), whereas mixed‑mode installations can tolerate broader shifts but still benefit from periodic checks.

Finding: Top‑traffic airports have at least 3 alternative high‑power aids within 50 km, but several medium hubs have only a single aid of each type. Implication: For those single‑aid airports, emergency plans must designate specific diversion fields (with overlapping coverage) or install an extra DME/VOR‑DME to guarantee continuous guidance if one beacon fails.

Finding: Legacy NDBs account for ~70% of all high‑power aids, particularly in developing regions, whereas newer VOR‑DMEs and VORTACs are concentrated in North America and Europe. Implication: Prioritize replacing isolated NDBs with VOR‑DME or satellite RNAV on routes serving commercial traffic; this will reduce maintenance and provide distance‑and‑bearing in one unit.

Finding: Military and hybrid fields (VORTAC‑equipped) are almost exclusively in the Western hemisphere, while civilian heliports rely on low‑power NDBs and occasional VORs. Commercial hubs mix VOR‑DME with VORTAC for redundancy. Implication: Procurement should follow role: military bases keep TACAN/VORTAC, remote heliports can continue using low‑power NDBs, and commercial fields focus on mixed VOR‑DME/VORTAC installations.

Finding: Desert and polar regions (High Elevation, Southern Western and Northern Western hemispheres) show under‑deployment of VOR/VOR‑DME—fields there still depend heavily on NDBs despite line‑of‑sight challenges. Implication: Installing more medium‑range VOR‑DME in these challenging terrains will improve all‑weather access and reduce weather‑related cancellations.

Session Information

sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: aarch64-apple-darwin20
## Running under: macOS 15.4.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] leaflet_2.2.2   sf_1.0-19       DT_0.33         knitr_1.48     
##  [5] skimr_2.1.5     janitor_2.2.1   lubridate_1.9.4 forcats_1.0.0  
##  [9] stringr_1.5.1   dplyr_1.1.4     purrr_1.0.2     readr_2.1.5    
## [13] tidyr_1.3.1     tibble_3.2.1    ggplot2_3.5.1   tidyverse_2.0.0
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       xfun_0.47          bslib_0.8.0        htmlwidgets_1.6.4 
##  [5] tzdb_0.4.0         crosstalk_1.2.1    vctrs_0.6.5        tools_4.4.1       
##  [9] generics_0.1.3     parallel_4.4.1     proxy_0.4-27       fansi_1.0.6       
## [13] highr_0.11         pkgconfig_2.0.3    KernSmooth_2.23-24 RColorBrewer_1.1-3
## [17] lifecycle_1.0.4    compiler_4.4.1     farver_2.1.2       munsell_0.5.1     
## [21] repr_1.1.7         snakecase_0.11.1   class_7.3-22       htmltools_0.5.8.1 
## [25] maps_3.4.2.1       sass_0.4.9         yaml_2.3.10        pillar_1.9.0      
## [29] crayon_1.5.3       jquerylib_0.1.4    classInt_0.4-11    cachem_1.1.0      
## [33] tidyselect_1.2.1   digest_0.6.37      stringi_1.8.4      labeling_0.4.3    
## [37] fastmap_1.2.0      grid_4.4.1         colorspace_2.1-1   cli_3.6.3         
## [41] magrittr_2.0.3     base64enc_0.1-3    utf8_1.2.4         e1071_1.7-16      
## [45] withr_3.0.1        scales_1.3.0       bit64_4.6.0-1      timechange_0.3.0  
## [49] rmarkdown_2.28     bit_4.5.0.1        hms_1.1.3          evaluate_0.24.0   
## [53] viridisLite_0.4.2  rlang_1.1.6        Rcpp_1.0.13        glue_1.8.0        
## [57] DBI_1.2.3          rstudioapi_0.17.1  vroom_1.6.5        jsonlite_2.0.0    
## [61] R6_2.5.1           units_0.8-5